1 Preparation


1.1 Genotyped data set preparation


First we define input and output path shortcuts.

input <- file.path(getwd(), "data", "Input")
output <- file.path(getwd(), "data", "Output")

Then we delete the the output folder and all the files and folders it contains from the previous run.

unlink(output, recursive = TRUE)

The following code chunk consists in the arguments that this R script can take from the user. “verbose” means whether to output to the terminal, “imputated” is if the program should expect an imputated dataset “EP” performs needed data correction before analysis in the Epistasis Project dataset and “ADNI” means if the input dataset also includes the ADNI dataset.

verbose <- FALSE

Then, we proceed to read and store the genotyped input data.

encoded_NA <- c("00", "", "???", "-9")
df <- read.csv2(file.path(input, "genotyped.csv"),
                na.strings = encoded_NA)
str(df, list.len = 30)
## 'data.frame':    2462 obs. of  100 variables:
##  $ ID                   : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ Plate                : chr  "AST5" "AST5" "AST5" "AST5" ...
##  $ WELL                 : chr  "B01" "B03" "D01" "B05" ...
##  $ CentreCODE           : chr  "RG 728" "RG 733" "RG 606" "RG 743" ...
##  $ Diag                 : chr  "AD" "AD" "AD" "AD" ...
##  $ SexLett              : chr  "Male" "Female" "Male" "Female" ...
##  $ Age                  : int  69 69 81 76 75 75 80 74 71 76 ...
##  $ AgeExam              : num  69 69 81 76 75 75 80 74 71 76 ...
##  $ AgeOnset             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AgeDeath             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Centre_APOE          : int  33 34 33 33 33 33 23 33 23 33 ...
##  $ Centre               : chr  "OVIEDO" "OVIEDO" "OVIEDO" "OVIEDO" ...
##  $ rs429358             : chr  "TT" "CT" "TT" "TT" ...
##  $ rs7412               : chr  "CC" "CC" "CC" "CC" ...
##  $ LGC_APOE             : int  33 34 33 33 33 33 23 33 23 33 ...
##  $ LGC_E4.              : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FID                  : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ IID                  : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ PID                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MID                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sex                  : int  1 2 1 2 2 1 1 2 2 1 ...
##  $ Dx                   : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ SORT1_rs17585355     : chr  "AA" "AA" "AA" "AA" ...
##  $ SORT1_rs2228604      : chr  "CC" "CC" "AC" "AC" ...
##  $ SORT1_rs7536292      : chr  "TT" "CT" "TT" "TT" ...
##  $ SORT1_rs17646665     : chr  "AA" "AA" "AA" "AA" ...
##  $ SORT1_rs1149175      : chr  "GG" "GG" "GG" "GG" ...
##  $ SORT1_rs10745354     : chr  "TT" "TT" "CT" "CT" ...
##  $ IL10_rs1800896       : chr  "AA" "GA" "GG" "GA" ...
##  $ BIN1_rs744373        : chr  "GG" "GA" "GA" "GG" ...
##   [list output truncated]

Reading the snps that we want to study

vector_of_gene_snps <- scan(file.path(input, "snps.txt"),
                            what = "character",
                            quiet = !verbose)
vector_of_gene_snps
##  [1] "TCN2_rs5749131"     "TCN2_rs5753231"     "TCN2_rs16988828"   
##  [4] "TCN2_rs9606756"     "TCN2_rs757874"      "TCN2_rs1801198"    
##  [7] "TCN2_rs5749135"     "TCN2_rs9621049"     "TCN2_rs1131603"    
## [10] "TCN2_rs7289553"     "TCN2_rs5997711"     "MTRR_rs1801394"    
## [13] "MTRR_rs13181011"    "MTRR_rs326120"      "MTRR_rs1532268"    
## [16] "MTRR_rs7703033"     "MTRR_rs6555501"     "MTRR_rs162035"     
## [19] "MTRR_rs3815743"     "MTRR_rs162040"      "MTRR_rs10475399"   
## [22] "SORT1_rs17585355"   "SORT1_rs2228604"    "SORT1_rs7536292"   
## [25] "SORT1_rs17646665"   "SORT1_rs1149175"    "SORT1_rs10745354"  
## [28] "BDNF_rs6265"        "BDNF_rs11030102"    "BDNF_rs11030104"   
## [31] "BDNF_rs11030119"    "BDNF_rs12288512"    "SHMT1_rs16961153"  
## [34] "SHMT1_rs1979277"    "SHMT1_rs669340"     "SHMT1_rs643333"    
## [37] "SHMT1_rs9901160"    "RFC1_rs11096990"    "RFC1_rs4975002"    
## [40] "RFC1_rs4975009"     "RFC1_rs6851075"     "APOE_rs429358"     
## [43] "APOE_rs7412"        "APOE_rs597668"      "ABCA1_rs1800977"   
## [46] "ABCA1_rs2422493"    "CYP46A1_rs7157609"  "CYP46A1_rs4900442" 
## [49] "DBH_rs1611115"      "DBH_rs1611131"      "GAB2_rs4945261"    
## [52] "GAB2_rs2373115"     "MAPT_rs242557"      "MAPT_rs2471738"    
## [55] "NPC1_rs2236707"     "NPC1_rs4800488"     "NR1H2_rs2695121"   
## [58] "NR1H2_rs1052533"    "NTRK2_rs1187323"    "NTRK2_rs1545285"   
## [61] "PEMT_rs7946"        "PEMT_rs12325817"    "VDR_rs731236"      
## [64] "VDR_rs7975232"      "BIN1_rs744373"      "CD33_rs3865444"    
## [67] "CDK5_rs2069442"     "HMDX1_rs2071746"    "HMGCR_rs3931914"   
## [70] "IL10_rs1800896"     "LRP1_rs1799986"     "MTHFD1L_rs11754661"
## [73] "PRNP_rs1799990"     "SLC19A1_rs1051266"  "TRAK2_rs13022344"
length(vector_of_gene_snps)
## [1] 75

Chosing other variables of interest

variables <- c("ID", "Diag", "SexLett", "Age", "Centre", "LGC_E4.")

Subsetting the dataframe with the desired variables

df <- df[, c(variables, vector_of_gene_snps)]
dim(df)
## [1] 2462   81

Then we check for missingness across all the dataset.

colSums(is.na(df))
##                 ID               Diag            SexLett                Age 
##                 21                 21                 21                 21 
##             Centre            LGC_E4.     TCN2_rs5749131     TCN2_rs5753231 
##                 21                 21                 82                 72 
##    TCN2_rs16988828     TCN2_rs9606756      TCN2_rs757874     TCN2_rs1801198 
##                 47                 56                 66                 62 
##     TCN2_rs5749135     TCN2_rs9621049     TCN2_rs1131603     TCN2_rs7289553 
##                 47                 57                 44                 89 
##     TCN2_rs5997711     MTRR_rs1801394    MTRR_rs13181011      MTRR_rs326120 
##                 65                 61                 42                 80 
##     MTRR_rs1532268     MTRR_rs7703033     MTRR_rs6555501      MTRR_rs162035 
##                 74                 61                 76                 47 
##     MTRR_rs3815743      MTRR_rs162040    MTRR_rs10475399   SORT1_rs17585355 
##                 85                 69                 75                 51 
##    SORT1_rs2228604    SORT1_rs7536292   SORT1_rs17646665    SORT1_rs1149175 
##                 59                 83                 37                 46 
##   SORT1_rs10745354        BDNF_rs6265    BDNF_rs11030102    BDNF_rs11030104 
##                 58                 47                 83                 69 
##    BDNF_rs11030119    BDNF_rs12288512   SHMT1_rs16961153    SHMT1_rs1979277 
##                 81                 54                 51                 72 
##     SHMT1_rs669340     SHMT1_rs643333    SHMT1_rs9901160    RFC1_rs11096990 
##                 46                 68                 51                387 
##     RFC1_rs4975002     RFC1_rs4975009     RFC1_rs6851075      APOE_rs429358 
##                 84                 58                 80                172 
##        APOE_rs7412      APOE_rs597668    ABCA1_rs1800977    ABCA1_rs2422493 
##                 59                 77                 57                 73 
##  CYP46A1_rs7157609  CYP46A1_rs4900442      DBH_rs1611115      DBH_rs1611131 
##                100                 48                 47                434 
##     GAB2_rs4945261     GAB2_rs2373115      MAPT_rs242557     MAPT_rs2471738 
##                 58                 52                 53                 94 
##     NPC1_rs2236707     NPC1_rs4800488    NR1H2_rs2695121    NR1H2_rs1052533 
##                 62                 53                 74                 89 
##    NTRK2_rs1187323    NTRK2_rs1545285        PEMT_rs7946    PEMT_rs12325817 
##                 69                 78                 80                 97 
##       VDR_rs731236      VDR_rs7975232      BIN1_rs744373     CD33_rs3865444 
##                 87                 69                 83                 58 
##     CDK5_rs2069442    HMDX1_rs2071746    HMGCR_rs3931914     IL10_rs1800896 
##                 87                 49                 58                 52 
##     LRP1_rs1799986 MTHFD1L_rs11754661     PRNP_rs1799990  SLC19A1_rs1051266 
##                 92                 70                 67                 62 
##   TRAK2_rs13022344 
##                 61

We remove those cases which are missing data for all six first columns (not genotypes)

df <- df[rowSums(is.na(df[,1:6])) != 6,]
colSums(is.na(df[,1:6]))
##      ID    Diag SexLett     Age  Centre LGC_E4. 
##       0       0       0       0       0       0

After that, we order the genotypes so that in the heterozygotes are always sorted by alphabetical order (“A”, “C”, “G”, “T”).

df[,c(vector_of_gene_snps)] <- lapply(df[,c(vector_of_gene_snps)], order_heterozygotes)

Then we create a list of objects that for each SNP contains information about its name, gene it belongs, genotype counts, genotype frequencies, allele frequencies and counts, minor allele and major allele etc.

list_of_objects <- generate_object(exists = exists('list_of_objects'),
                                   dataset = df,
                                   snps = vector_of_gene_snps,
                                   verbose = verbose)
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
## 
## 
## $snps
##  [1] "rs5749131"  "rs5753231"  "rs16988828" "rs9606756"  "rs757874"  
##  [6] "rs1801198"  "rs5749135"  "rs9621049"  "rs1131603"  "rs7289553" 
## [11] "rs5997711"  "rs1801394"  "rs13181011" "rs326120"   "rs1532268" 
## [16] "rs7703033"  "rs6555501"  "rs162035"   "rs3815743"  "rs162040"  
## [21] "rs10475399" "rs17585355" "rs2228604"  "rs7536292"  "rs17646665"
## [26] "rs1149175"  "rs10745354" "rs6265"     "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277"  "rs669340"  
## [36] "rs643333"   "rs9901160"  "rs11096990" "rs4975002"  "rs4975009" 
## [41] "rs6851075"  "rs429358"   "rs7412"     "rs597668"   "rs1800977" 
## [46] "rs2422493"  "rs7157609"  "rs4900442"  "rs1611115"  "rs1611131" 
## [51] "rs4945261"  "rs2373115"  "rs242557"   "rs2471738"  "rs2236707" 
## [56] "rs4800488"  "rs2695121"  "rs1052533"  "rs1187323"  "rs1545285" 
## [61] "rs7946"     "rs12325817" "rs731236"   "rs7975232"  "rs744373"  
## [66] "rs3865444"  "rs2069442"  "rs2071746"  "rs3931914"  "rs1800896" 
## [71] "rs1799986"  "rs11754661" "rs1799990"  "rs1051266"  "rs13022344"
## 
## $genes
##  [1] "TCN2"    "MTRR"    "SORT1"   "BDNF"    "SHMT1"   "RFC1"    "APOE"   
##  [8] "ABCA1"   "CYP46A1" "DBH"     "GAB2"    "MAPT"    "NPC1"    "NR1H2"  
## [15] "NTRK2"   "PEMT"    "VDR"     "BIN1"    "CD33"    "CDK5"    "HMDX1"  
## [22] "HMGCR"   "IL10"    "LRP1"    "MTHFD1L" "PRNP"    "SLC19A1" "TRAK2"  
## 
## 
## The list has a total of 75 snps and 28 genes

Each of the SNP objects present in the list_of_objects have the following structure:

list_of_objects[[1]]
## ###### rs5749131 ######
## 
## [gene] TCN2
## 
## [id] rs5749131
## 
## [geno_count] AA = 173;  AG = 478;  GG = 288;
## 
## [geno_freq] AA = 18.424;  AG = 50.905;  GG = 30.671;
## 
## [allele_count] A = 824;  G = 1054;
## 
## [allele_freq] A = 43.876;  G = 56.124;
## 
## [minor_allele] A
## 
## [major_allele] G

We can extract just the reference snp id from this object

vector_of_snps <- sapply(list_of_objects, function(x) x$id)
vector_of_snps <- unname(vector_of_snps)
vector_of_snps
##  [1] "rs5749131"  "rs5753231"  "rs16988828" "rs9606756"  "rs757874"  
##  [6] "rs1801198"  "rs5749135"  "rs9621049"  "rs1131603"  "rs7289553" 
## [11] "rs5997711"  "rs1801394"  "rs13181011" "rs326120"   "rs1532268" 
## [16] "rs7703033"  "rs6555501"  "rs162035"   "rs3815743"  "rs162040"  
## [21] "rs10475399" "rs17585355" "rs2228604"  "rs7536292"  "rs17646665"
## [26] "rs1149175"  "rs10745354" "rs6265"     "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277"  "rs669340"  
## [36] "rs643333"   "rs9901160"  "rs11096990" "rs4975002"  "rs4975009" 
## [41] "rs6851075"  "rs429358"   "rs7412"     "rs597668"   "rs1800977" 
## [46] "rs2422493"  "rs7157609"  "rs4900442"  "rs1611115"  "rs1611131" 
## [51] "rs4945261"  "rs2373115"  "rs242557"   "rs2471738"  "rs2236707" 
## [56] "rs4800488"  "rs2695121"  "rs1052533"  "rs1187323"  "rs1545285" 
## [61] "rs7946"     "rs12325817" "rs731236"   "rs7975232"  "rs744373"  
## [66] "rs3865444"  "rs2069442"  "rs2071746"  "rs3931914"  "rs1800896" 
## [71] "rs1799986"  "rs11754661" "rs1799990"  "rs1051266"  "rs13022344"

And then we can convert the categorical variables in the dataframe into the factor datatype.

df[-c(1,4)] <- lapply(df[-c(1,4)], as.factor)


1.2 Imputated data set preparation


First we read and store the the imputated data.

encoded_NA <- c("#NULL!", "NA")
imput_df <- read.csv2(file.path(input, "imputated.csv"), 
                      na.strings = encoded_NA)
str(imput_df, list.len = 30)
## 'data.frame':    6357 obs. of  93 variables:
##  $ id                : int  3406 4010 12046 3784 2210 8831 1106 5426 3578 8432 ...
##  $ sex               : chr  "Woman" "Woman" "Woman" "Woman" ...
##  $ age.at.entry      : num  58 58 59 58 60 60 55 57 57 57 ...
##  $ Age.to.Use        : num  60 60 60 60 60 60 60 61 61 61 ...
##  $ Age75             : chr  "lessthan75" "lessthan75" "lessthan75" "lessthan75" ...
##  $ prevalent_dementia: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ incident_dementia : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dx                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DiagName          : chr  "CTRL" "CTRL" "CTRL" "CTRL" ...
##  $ ageatdementia     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ followup          : num  2.0589 1.8563 1.2238 1.9439 0.0493 ...
##  $ E4status          : chr  "E4+" "2" "E4-" "E4-" ...
##  $ apoe              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rs429358          : num  1.017 0.981 1.999 2 1.994 ...
##  $ rs7412            : num  1 1.05 1 2 2 ...
##  $ rs670139          : num  1 2 2 1 2 2 0 2 1 2 ...
##  $ rs242557          : int  2 NA 1 NA 1 0 0 1 1 1 ...
##  $ rs1052533         : int  1 NA 0 1 1 1 2 1 0 0 ...
##  $ rs2471738         : int  2 1 0 0 0 0 0 0 1 1 ...
##  $ rs4975002         : num  0.001 1.001 1.999 1 0.999 ...
##  $ rs1131603         : num  2 2 1.08 2 1.66 ...
##  $ rs7975232         : num  0 2 0 1.07 2 ...
##  $ rs2695121         : num  1 2 1.03 2 1.01 ...
##  $ rs3865444         : num  1.999 0.141 1.005 2 0.784 ...
##  $ rs4975009         : num  0.006 1 2 0 0.808 ...
##  $ rs1800977         : num  1 2 0 0 1 2 1 2 1 1 ...
##  $ rs709149          : num  1 2 1 1 1 0 2 0 1 1 ...
##  $ rs12288512        : num  1 2 2 1 1 ...
##  $ rs4945261         : int  2 1 0 1 2 2 2 2 2 1 ...
##  $ rs7946            : num  0.008 1 0 0 0 0 0 1 0 0 ...
##   [list output truncated]

Then we subset the data of interest.

imput_variables <- c("id", "DiagName", "sex", "Age.to.Use", "E4status")
imput_df <- imput_df[,c(imput_variables, vector_of_snps)]
dim(imput_df)
## [1] 6357   80

Then we check for missingness across all the dataset.

colSums(is.na(imput_df))
##         id   DiagName        sex Age.to.Use   E4status  rs5749131  rs5753231 
##          0          0          0          0        224        431        431 
## rs16988828  rs9606756   rs757874  rs1801198  rs5749135  rs9621049  rs1131603 
##        431        431        431        431        431        431        431 
##  rs7289553  rs5997711  rs1801394 rs13181011   rs326120  rs1532268  rs7703033 
##        431        431        431        431        431        431        431 
##  rs6555501   rs162035  rs3815743   rs162040 rs10475399 rs17585355  rs2228604 
##        431        431        431        431        431        431        431 
##  rs7536292 rs17646665  rs1149175 rs10745354     rs6265 rs11030102 rs11030104 
##        431        431        431        431        431        431        431 
## rs11030119 rs12288512 rs16961153  rs1979277   rs669340   rs643333  rs9901160 
##        431        431        431        431        431        431        431 
## rs11096990  rs4975002  rs4975009  rs6851075   rs429358     rs7412   rs597668 
##        431        431        431        431        431        431        431 
##  rs1800977  rs2422493  rs7157609  rs4900442  rs1611115  rs1611131  rs4945261 
##        431        431        431        431        431        431        431 
##  rs2373115   rs242557  rs2471738  rs2236707  rs4800488  rs2695121  rs1052533 
##        431        299        275        431        431        431        407 
##  rs1187323  rs1545285     rs7946 rs12325817   rs731236  rs7975232   rs744373 
##        431        431        431        431        431        431        431 
##  rs3865444  rs2069442  rs2071746  rs3931914  rs1800896  rs1799986 rs11754661 
##        431        431        431        431        431        431        431 
##  rs1799990  rs1051266 rs13022344 
##        431        431        431

And we remove cases which have a missing value for the column E4status.

imput_df <- imput_df[is.na(imput_df$E4status) == FALSE,]
colSums(is.na(imput_df[,1:5]))
##         id   DiagName        sex Age.to.Use   E4status 
##          0          0          0          0          0

Then, we import data from the allele schema used in the imputation, in order to reconvert it back to genotypes.

imput_scheme <- read.csv(file.path(input, "imputation_scheme.csv"), sep = ";")
imput_scheme <- imput_scheme[imput_scheme$SNP %in% vector_of_snps,]
colnames(imput_scheme) <- c("snp_id", "reference", "alternative")

There is still some snp that is not found in the reference scheme. It seems some of the snps in the dataset were directly genotyped and codified as dummy allele dosage data, so we will use the human reference genome GRCh37 to recodify them.

genotyped_snps <- setdiff(vector_of_snps, imput_scheme$snp_id)
genotyped_snps
## [1] "rs1052533"
GRCh37 <- read.table(file.path(input, "GRCh37_snps.txt"), quote = "\"", comment.char = "")
colnames(GRCh37) <- c("chromosome", "location", "snp_id", "reference", "alternative")
index <- nrow(imput_scheme)
for (snp in genotyped_snps) {
  i <- which(genotyped_snps == snp)
  imput_scheme[index + i,] <- GRCh37[GRCh37$snp_id == snp,][,c(3,4,5)]
}
row.names(imput_scheme) <- NULL

So the final dataframe used to recodify the imputated data into genotypes will be the following one:

imput_scheme
snp_id reference alternative
rs10475399 A G
rs1051266 T C
rs1052533 G A
rs10745354 T C
rs11030102 C G
rs11030104 A G
rs11030119 G A
rs11096990 C T
rs1131603 T C
rs1149175 G A
rs11754661 G A
rs1187323 A C
rs12288512 G A
rs12325817 C G
rs13022344 T C
rs13181011 T C
rs1532268 C T
rs1545285 T G
rs1611115 C T
rs1611131 A G
rs162035 G A
rs162040 A C
rs16961153 G A
rs16988828 A G
rs17585355 A C
rs17646665 A G
rs1799986 C T
rs1799990 A G
rs1800896 T C
rs1800977 G A
rs1801198 C G
rs1801394 A G
rs1979277 G A
rs2069442 C G
rs2071746 A T
rs2228604 G T
rs2236707 C T
rs2373115 C A
rs2422493 G A
rs242557 A G
rs2471738 T C
rs2695121 C T
rs326120 A G
rs3815743 A G
rs3865444 C A
rs3931914 C G
rs429358 T C
rs4800488 A C
rs4900442 C T
rs4945261 G A
rs4975002 T G
rs4975009 T C
rs5749131 G A
rs5749135 C T
rs5753231 C T
rs597668 T C
rs5997711 C T
rs6265 C T
rs643333 G T
rs6555501 T C
rs669340 C G
rs6851075 T C
rs7157609 G A
rs7289553 A G
rs731236 A G
rs7412 C T
rs744373 A G
rs7536292 T C
rs757874 G T
rs7703033 G A
rs7946 C T
rs7975232 C A
rs9606756 A G
rs9621049 C T
rs9901160 G A

First, though the imputation data is converted to the numeric datatype.

imput_df[, 6:ncol(imput_df)] <- lapply(imput_df[,6:ncol(imput_df)], 
                                       function(x) as.numeric(x))

Then we proceed to the recoding of the imputation data into genotyped one.

imput_df <- genotype_imputated_df(list_of_objects = list_of_objects, 
                                  df = imput_df, 
                                  scheme = imput_scheme,
                                  match_strands = T,
                                  verbose = verbose)

Then again like we did with the genotyped dataset, we order the genotypes so that in the heterozygotes are always sorted by alphabetical order (“A”, “C”, “G”, “T”).

imput_df[,c(vector_of_snps)] <- lapply(imput_df[,c(vector_of_snps)], order_heterozygotes)

And we convert data types from characters to factors

imput_df[-c(1,4)] <- lapply(imput_df[-c(1,4)], as.factor)


1.3 Merging imputated and genotyped datasets


Renaming variables names in the datasets

colnames(df) <- c("ID", "Diag", "Sex", "Age_to_use", "Centre", "E4status", vector_of_snps)
colnames(imput_df) <- c("ID", "Diag", "Sex", "Age_to_use", "E4status", vector_of_snps)

As we are studying LOAD, subset cases higher or equal than 60 years old in the datasets

df_old <- df
imput_df_old <- imput_df
df <- df[df$Age_to_use >= 60,]
imput_df <- imput_df[imput_df$Age_to_use >= 60,]

And we can see that 81 cases were removed from the genotype dataset, and 0 from the imputated.

nrow(df_old) - nrow(df)
## [1] 81
# 81

nrow(imput_df_old) - nrow(imput_df)
## [1] 0
# 0

We can calculate the median of the combined population

median(c(df$Age_to_use, imput_df$Age_to_use))
## [1] 82
# 82

Renaming the levels of E4status in the genotyped dataset.

levels(df$E4status)
## [1] "0" "1"
levels(df$E4status) <- c("E4-", "E4+")

And the levels of Diagnosis in the imputated dataset.

levels(imput_df$Diag)
## [1] "CTRL"       "dementiaAD"
levels(imput_df$Diag) <- c("Control", "AD")

We also create a binary variable in both datasets that tells whether the age of the individual is equal or above the median of the population. In this case, as we saw earlier it is 82.

df$Age82 <- ifelse(df$Age_to_use >= 82, ">82", "<82")
imput_df$Age82 <- ifelse(imput_df$Age_to_use >= 82, ">82", "<82")
df$Age82 <- as.factor(df$Age82)

We found a case of unknown gender in the imputated dataset, so we remove it and reset the levels. And then rename them as “Male” and “Female”.

table(imput_df$Sex)
## 
##     1   Man Woman 
##     1  2463  3669
imput_df$Sex <- as.character(imput_df$Sex)
imput_df <- imput_df[imput_df$Sex != 1,]
imput_df$Sex <- factor(imput_df$Sex) 
levels(imput_df$Sex)
## [1] "Man"   "Woman"
levels(imput_df$Sex) <- c("Male", "Female")

After a quick look at the data we decide to codify as missing an unexplained level in the E4status variable at the Rotterdam dataset. And remove those missing cases.

table(imput_df$E4status)
## 
##    2  E4-  E4+ 
##  141 4423 1568
imput_df$E4status <- as.character(imput_df$E4status)
imput_df <- imput_df[imput_df$E4status != 2,]
imput_df$E4status <- factor(imput_df$E4status)

And we create a new variable “Centre” to keep track that this cases come from the center of Rotterdam.

imput_df$Centre <- "ROTTERDAM"

The last step before merging is extracting the data from Rotterdam to fill up the list_of_objects created in a previous step.

list_of_objects <- generate_object(exists = exists('list_of_objects'), 
                                   dataset = imput_df, 
                                   snps = vector_of_snps)

Then, the final structure obtained would be similar to the following one:

str(list_of_objects)
## This is just a convenience function, invocated when str() is used, to provide an overview of the list elements. To view its contents, use list_of_objects$your_snp_id
## 
## An example of the structure of the objects in the list is the following one:
## 
## rs5749131 : List of 14
##  $ gene              : chr "TCN2"
##  $ id                : chr "rs5749131"
##  $ geno_count        : 'table' int [1:3(1d)] 173 478 288
##  $ geno_freq         : 'table' num [1:3(1d)] 18.4 50.9 30.7
##  $ allele_count      : Named num [1:2] 824 1054
##  $ allele_freq       : Named num [1:2] 43.9 56.1
##  $ minor_allele      : chr "A"
##  $ major_allele      : chr "G"
##  $ imput_geno_count  : 'table' int [1:3(1d)] 889 2224 1455
##  $ imput_geno_freq   : 'table' num [1:3(1d)] 19.5 48.7 31.9
##  $ imput_allele_count: Named num [1:2] 4002 5134
##  $ imput_allele_freq : Named num [1:2] 43.8 56.2
##  $ imput_minor_allele: chr "A"
##  $ imput_major_allele: chr "G"

Once we’ve done that, we can proceed with the merging.

matching_variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", vector_of_snps)
All <- merge(x = df, y = imput_df, by = matching_variables, all = T)
str(All, list.len = 30)
## 'data.frame':    8351 obs. of  82 variables:
##  $ ID        : chr  "10" "100" "1000" "10000" ...
##  $ Diag      : Factor w/ 2 levels "AD","Control": 2 2 1 2 2 2 2 2 2 2 ...
##  $ Sex       : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 2 1 ...
##  $ E4status  : Factor w/ 2 levels "E4-","E4+": 1 1 2 1 1 1 1 1 2 1 ...
##  $ Age_to_use: num  82 77 82 78 92 78 82 83 86 87 ...
##  $ Age82     : Factor w/ 2 levels "<82",">82": 2 1 2 1 2 1 2 2 2 2 ...
##  $ Centre    : Factor w/ 7 levels "BRISTOL","MADRID",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ rs5749131 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 2 2 2 ...
##  $ rs5753231 : Factor w/ 3 levels "CC","CT","TT": 2 1 1 2 1 2 2 1 1 2 ...
##  $ rs16988828: Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 1 1 1 ...
##  $ rs9606756 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 2 1 2 ...
##  $ rs757874  : Factor w/ 3 levels "GG","GT","TT": 1 1 2 2 1 2 2 1 1 1 ...
##  $ rs1801198 : Factor w/ 3 levels "CC","CG","GG": 3 1 2 2 1 2 2 2 2 2 ...
##  $ rs5749135 : Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 3 2 2 1 2 1 ...
##  $ rs9621049 : Factor w/ 3 levels "CC","CT","TT": 1 2 1 1 1 1 1 2 1 2 ...
##  $ rs1131603 : Factor w/ 3 levels "CC","CT","TT": 3 3 3 3 2 3 3 3 3 3 ...
##  $ rs7289553 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 1 2 2 ...
##  $ rs5997711 : Factor w/ 3 levels "CC","CT","TT": 3 1 2 2 1 2 2 2 2 2 ...
##  $ rs1801394 : Factor w/ 3 levels "AA","AG","GG": 3 2 1 3 3 3 1 2 3 3 ...
##  $ rs13181011: Factor w/ 3 levels "CC","CT","TT": 1 3 3 3 2 2 3 3 2 2 ...
##  $ rs326120  : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 3 2 1 1 ...
##  $ rs1532268 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 3 2 2 3 3 2 2 ...
##  $ rs7703033 : Factor w/ 3 levels "AA","AG","GG": 3 2 3 1 2 2 3 2 2 2 ...
##  $ rs6555501 : Factor w/ 3 levels "CC","CT","TT": 3 2 3 1 2 2 3 2 2 2 ...
##  $ rs162035  : Factor w/ 3 levels "AA","AG","GG": 3 3 3 2 3 3 3 3 3 3 ...
##  $ rs3815743 : Factor w/ 3 levels "AA","AG","GG": 1 1 2 1 1 1 1 1 1 1 ...
##  $ rs162040  : Factor w/ 3 levels "AA","AC","CC": 1 2 1 1 1 1 3 1 1 1 ...
##  $ rs10475399: Factor w/ 3 levels "AA","AG","GG": 3 2 2 2 3 3 1 2 3 3 ...
##  $ rs17585355: Factor w/ 3 levels "AA","AC","CC": 1 1 1 1 1 1 1 1 1 2 ...
##  $ rs2228604 : Factor w/ 3 levels "AA","AC","CC": 3 3 2 2 1 2 3 2 2 2 ...
##   [list output truncated]

Before proceeding into quality control we should check missingness in the obtained dataset.

colSums(is.na(All))
##         ID       Diag        Sex   E4status Age_to_use      Age82     Centre 
##          0          0          0          0          0          0          0 
##  rs5749131  rs5753231 rs16988828  rs9606756   rs757874  rs1801198  rs5749135 
##        458        450        423        432        444        440        425 
##  rs9621049  rs1131603  rs7289553  rs5997711  rs1801394 rs13181011   rs326120 
##        434        420        465        443        439        420        454 
##  rs1532268  rs7703033  rs6555501   rs162035  rs3815743   rs162040 rs10475399 
##        451        439        453        425        462        445        453 
## rs17585355  rs2228604  rs7536292 rs17646665  rs1149175 rs10745354     rs6265 
##        427        437        461        415        423        436        424 
## rs11030102 rs11030104 rs11030119 rs12288512 rs16961153  rs1979277   rs669340 
##        460        445        459        432        429        448        423 
##   rs643333  rs9901160 rs11096990  rs4975002  rs4975009  rs6851075   rs429358 
##        445        427        764        459        435        456        548 
##     rs7412   rs597668  rs1800977  rs2422493  rs7157609  rs4900442  rs1611115 
##        437        453        434        451        477        426        425 
##  rs1611131  rs4945261  rs2373115   rs242557  rs2471738  rs2236707  rs4800488 
##        809        436        430        310        327        439        431 
##  rs2695121  rs1052533  rs1187323  rs1545285     rs7946 rs12325817   rs731236 
##        450        444        446        452        456        474        465 
##  rs7975232   rs744373  rs3865444  rs2069442  rs2071746  rs3931914  rs1800896 
##        446        461        436        461        425        436        428 
##  rs1799986 rs11754661  rs1799990  rs1051266 rs13022344 
##        470        447        443        440        439

Finally, we divide the dataset into regions according to the centres.

All$Region <- ifelse(All$Centre == "MADRID" | All$Centre == "OVIEDO" | All$Centre == "SANTANDER", "Spain", "N.Eur")


1.4 Quality control


We will perform a quality control of the samples before proceeding to the analysis step. An example of typical quality control procedures usually performed can be found here


1.4.1 Formatting dataset


Once we’ve merged both datasets we can proceed with quality control procedures.

We’ve previosly prepared a map file with the snps under study. It was generated with the GRCh38 genome assembly version.

map <- read.delim(file.path(input, "raw_data.map"), header = FALSE)
colnames(map) <- c("Chr", "SNP", "GD", "BPP")

The file has the following structure:

map
Chr SNP GD BPP
1 rs10745354 0 109389286
1 rs1149175 0 109379755
1 rs17585355 0 109315193
1 rs17646665 0 109369429
1 rs1800896 0 206773552
1 rs2228604 0 109342153

We can see that the snps ids are exactly the same found in our dataset so we already got all the information we need for the map file.

setdiff(map$SNP, names(list_of_objects))
## character(0)
setdiff(names(list_of_objects), map$SNP)
## character(0)

Now, we take care of the ped file. First, we create an empty ped file structure that we will fill up with the data.

ped <- data.frame(matrix(ncol = 6, nrow = nrow(All)))
colnames(ped) <- c("FID", "IID", "PID", "MID", "Sex", "P")

Then we introduce the columns values for each of the defined variables

ped$FID <- All$ID #It is the same plink does when the flag --no-fid is used
ped$IID <- All$ID
ped$PID <- 0
ped$MID <- 0
ped$Sex <- sapply(All$Sex, function(x) {if (is.na(x)) {0} else if (x == "Male") {1} else if (x == "Female") {2} else {0}})
ped$P <- sapply(All$Diag, function(x) {if (is.na(x)) {-9} else if (x == "Control") {1} else if (x == "AD") {2} else {-9}})

If we take look we can see that the variables have been coded correctly

head(All[,c("ID", "Sex", "Diag")])
##      ID    Sex    Diag
## 1    10   Male Control
## 2   100 Female Control
## 3  1000   Male      AD
## 4 10000 Female Control
## 5 10004 Female Control
## 6 10005   Male Control
head(ped[,c("IID", "Sex", "P")])
##     IID Sex P
## 1    10   1 1
## 2   100   2 1
## 3  1000   1 2
## 4 10000   2 1
## 5 10004   2 1
## 6 10005   1 1

Now, then the final step would be to add the genotypes, and as the version of plink we are using (PLINK v1.90) parses the genotypes, we don’t need to split the columns we have into allele columns and we can just bind it directly to our ped file.

ped <- cbind(ped, All[,map$SNP])

But we have yet some missing values remaining in our dataset.

colSums(is.na(ped))
##        FID        IID        PID        MID        Sex          P rs10745354 
##          0          0          0          0          0          0        436 
##  rs1149175 rs17585355 rs17646665  rs1800896  rs2228604  rs7536292 rs13022344 
##        423        427        415        428        437        461        439 
##   rs744373 rs11096990  rs4975002  rs4975009  rs6851075 rs10475399 rs13181011 
##        461        764        459        435        456        453        420 
##  rs1532268   rs162035   rs162040  rs1801394   rs326120  rs3815743  rs3931914 
##        451        425        445        439        454        462        436 
##  rs6555501  rs7703033 rs11754661  rs2069442  rs1187323  rs1545285  rs1611115 
##        453        439        447        461        446        452        425 
##  rs1611131  rs1800977  rs2422493 rs11030102 rs11030104 rs11030119 rs12288512 
##        809        434        451        460        445        459        432 
##  rs2373115  rs4945261     rs6265  rs1799986   rs731236  rs7975232  rs4900442 
##        430        436        424        470        465        446        426 
##  rs7157609 rs12325817 rs16961153  rs1979277   rs242557  rs2471738   rs643333 
##        477        474        429        448        310        327        445 
##   rs669340     rs7946  rs9901160  rs2236707  rs4800488  rs1052533  rs2695121 
##        423        456        427        439        431        444        450 
##  rs3865444   rs429358   rs597668     rs7412  rs1799990  rs1051266  rs1131603 
##        436        548        453        437        443        440        420 
## rs16988828  rs1801198  rs2071746  rs5749131  rs5749135  rs5753231  rs5997711 
##        423        440        425        458        425        450        443 
##  rs7289553   rs757874  rs9606756  rs9621049 
##        465        444        432        434

And we convert the missing values to 00, as plink expects.

ped[,map$SNP] <- lapply(ped[,map$SNP], as.character)
ped[is.na(ped)] <- "00"
sum(is.na(ped))
## [1] 0

Then we can save the ped file in order to use it as input for plink.

write.table(ped, file = file.path(input, "raw_data.ped"), quote = F, sep = "\t", row.names = F, col.names = F)


1.4.3 Making binary files


The first step is to create a folder where to store the output that we will obtain in all subsequent steps using the program.

mkdir -p "$PWD"/data/Output/plink/0-Data
run_shell('mkdir %CD%/data/Output/plink/0-Data')

The next step step will be to convert the files obtained into binary files to speed up the computation.

plink --file "$PWD"/data/Input/raw_data --make-bed --out "$PWD"/data/Output/plink/0-Data/binary_data
run_shell('plink --file %CD%/data/Input/raw_data --make-bed --out %CD%/data/Output/plink/0-Data/binary_data', ignore.stdout = !verbose)


1.4.4 Removing missingness


We first filter SNPs and individuals based on a relaxed threshold (0.2; >20%), as this will filter out SNPs and individuals with very high levels of missingness. Then a filter with a more stringent threshold will be applied (0.02). We do first SNP filtering before individual filtering.

mkdir -p "$PWD"/data/Output/plink/1-QC/1-Missingness
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --missing --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4
run_shell('mkdir %CD%/data/Output/plink/1-QC/1-Missingness')
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --missing --out %CD%/data/Output/plink/1-QC/1-Missingness/missing', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_1', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_2', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_3', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_4', ignore.stdout = !verbose)


1.4.5 Removing SNPs with lower MAF than threshold


SNPs with a low MAF are rare, therefore power is lacking for detecting SNP‐phenotype associations. These SNPs are also more prone to genotyping errors. The MAF threshold depends on sample size, larger samples can use lower MAF thresholds. As the sample size in this study is moderate (N = 8716), we use the 0.05 as MAF threshold.

mkdir "$PWD"/data/Output/plink/1-QC/2-MAF
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele
run_shell('mkdir %CD%/data/Output/plink/1-QC/2-MAF')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out %CD%/data/Output/plink/1-QC/2-MAF/minor_allele', ignore.stdout = !verbose)


1.4.6 Looking at deviations from Hardy-Weinberg equilibrium


Deviations from HWE is a common indicator of genotyping error, but it may also indicate evolutionary selection. We will use a threshold of 10^-6 for controls and 10^-10 for cases.

mkdir "$PWD"/data/Output/plink/1-QC/3-HWE
plink --bfile "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-4 --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1
run_shell('mkdir %CD%/data/Output/plink/1-QC/3-HWE')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1', ignore.stdout = !verbose)


1.4.7 Removing individuals with a heterozygosity rate deviating more than 3SD from the mean


Deviations can indicate sample contamination or inbreeding. Heterozygosity checks are performed on a set of SNPs that are not highly correlated. To generate this list of non-(highly)correlated SNPs, we exclude high inversion regions.

mkdir "$PWD"/data/Output/plink/1-QC/4-Het
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude "$PWD"/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP
run_shell('mkdir %CD%/data/Output/plink/1-QC/4-Het')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude %CD%/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out %CD%/data/Output/plink/1-QC/4-Het/indepSNP', ignore.stdout = !verbose)

Once we’ve done that we can compute the heterozygosity rates

plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out "$PWD"/data/Output/plink/1-QC/4-Het/Het_check
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract %CD%/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out %CD%/data/Output/plink/1-QC/4-Het/Het_check', ignore.stdout = !verbose)

From this file we can generate a list of individuals who deviate more than 3 standard deviations from the heterozigosity rate mean.

# All credit to Andries Marees at https://github.com/MareesAT/GWA_tutorial
het <- read.table(file.path(output, "plink", "1-QC", "4-Het", "Het_check.het"), head = TRUE)
het$HET_RATE <-  (het$"N.NM." - het$"O.HOM.")/het$"N.NM."
het_fail <-  subset(het, (het$HET_RATE < mean(het$HET_RATE) - 3*sd(het$HET_RATE)) | (het$HET_RATE > mean(het$HET_RATE) + 3*sd(het$HET_RATE)))
het_fail$HET_DST <-  (het_fail$HET_RATE - mean(het$HET_RATE))/sd(het$HET_RATE)
write.table(het_fail[,1:2], file.path(output, "plink", "1-QC", "4-Het", "fail-het-qc.txt"), row.names = FALSE, quote = F)

Now, we proceed to remove those heterozygosity rate outliers

plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove "$PWD"/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove %CD%/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out %CD%/data/Output/plink/1-QC/4-Het/heterozigosity', ignore.stdout = !verbose)


1.4.8 Population structure analysis


We can also check for population substrure in our dataset, for instance by performing a PCA.

This additional analysis is only actually performed on Unix computers, if in windows I have just provided the commands used and and example of the output obtained. It also isn’t run by default, because of the bottleneck caused by this additional analysis. The results obtained are shown as recorded on a .png file of the PCA plot. You can control if the analysis should be performed by setting the run_PCA variable as TRUE or FALSE.

run_PCA <- FALSE

For that, we downloaded a 1000 genomes file of 629 individuals which were genotyped in blablabla SNPs. This file was downloaded from here: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz.

After downloading it the file was converted to plink files, and performed QC as described in the Population Stratification tutorial in https://github.com/MareesAT/GWA_tutorial.


1.4.8.1 Removing unshared SNPs


Just the snps present in both datasets were kept to be able to make the comparison. Due to file size, of the thousand genomes files, we’ve already extracted the relevant SNPs from the thousand genomes plink files. To do so for the EP dataset we might do:

mkdir -p "$PWD"/data/Output/plink/2-POP/
awk '{print$2}' "$PWD"/data/Input/1KG_PCA/1KG.bim > "$PWD"/data/Output/plink/2-POP/1KG_snps.txt
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --extract "$PWD"/data/Output/plink/2-POP/1KG_snps.txt --make-bed --recode --out "$PWD"/data/Output/plink/2-POP/EP
cat "$PWD"/data/Output/plink/2-POP/1KG_snps.txt | wc -l


1.4.8.2 Update build


Then, the next step would be to check that both datasets have the same build and if not update the older 1KG file accordingly.

awk '{print$2,$4}' "$PWD"/data/Output/plink/2-POP/EP.map > "$PWD"/data/Output/plink/2-POP/buildEP.txt
plink --bfile "$PWD"/data/Input/1KG_PCA/1KG --update-map "$PWD"/data/Output/plink/2-POP/buildEP.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/1KG_PCA


1.4.8.3 Prepare merging


In order to merge both files, first we have to ascertain that both files refer to the same reference and alternative alleles.

awk '{print$2,$5}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_tmp
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/all_differences.txt
cat "$PWD"/data/Output/plink/2-POP/all_differences.txt

Those differences might be just caused by looking at different strands so we will flip those bases and see if the error persists.

awk '{print$1}' "$PWD"/data/Output/plink/2-POP/all_differences.txt | sort -u > "$PWD"/data/Output/plink/2-POP/flip_list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj --flip "$PWD"/data/Output/plink/2-POP/flip_list.txt --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj_2

## Now check if they are still problematic
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/still_differences.txt
cat "$PWD"/data/Output/plink/2-POP/still_differences.txt


1.4.8.4 Merge


And we can see that there are no remaining differences anymore so we shouldn’t remove any SNP and we can proceed to merge both files.

plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj_2 --bmerge "$PWD"/data/Output/plink/2-POP/1KG_PCA.bed "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim "$PWD"/data/Output/plink/2-POP/1KG_PCA.fam --allow-no-sex --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA


1.4.8.5 Labeling the superpopulations


Before doing the PCA we can label the data according to different human superpopulations (AFR, EUR etc) in order to be able to keep track of them when we plot them graphically. For that we get a file with population information from the 1000 genomes and transform the subpopulation codes in our files in superpopulation ones.

# Note: for macOS/BSD users use GNU sed instead of default sed, linux users may use sed directly instead of gsed
wget -P "$PWD"/data/Output/plink/2-POP/ ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/20100804.ALL.panel
## rename populations
awk '{print$1,$1,$2}' "$PWD"/data/Output/plink/2-POP/20100804.ALL.panel > "$PWD"/data/Output/plink/2-POP/pop_1kG.txt
gsed 's/JPT/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG2.txt
gsed 's/ASW/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG2.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG3.txt
gsed 's/CEU/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG3.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG4.txt
gsed 's/CHB/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG4.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG5.txt
gsed 's/CHD/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG5.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG6.txt
gsed 's/YRI/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG6.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG7.txt
gsed 's/LWK/AFR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG7.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG8.txt
gsed 's/TSI/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG8.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG9.txt
gsed 's/MXL/AMR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG9.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG10.txt
gsed 's/GBR/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG10.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG11.txt
gsed 's/FIN/EUR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG11.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG12.txt
gsed 's/CHS/ASN/g' "$PWD"/data/Output/plink/2-POP/pop_1kG12.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG13.txt
gsed 's/PUR/AMR/g' "$PWD"/data/Output/plink/2-POP/pop_1kG13.txt>"$PWD"/data/Output/plink/2-POP/pop_1kG14.txt

# Create our own population file
awk '{print$1,$2,"EP"}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.fam > "$PWD"/data/Output/plink/2-POP/pop_EP.txt

# Concatenate population files.
cat "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt "$PWD"/data/Output/plink/2-POP/pop_EP.txt | gsed -e '1i\FID IID SUPERPOP' > "$PWD"/data/Output/plink/2-POP/popfile.txt

# remove temporary files from above
rm -f "$PWD"/data/Output/plink/2-POP/pop_*.txt


1.4.8.6 PCA


With the data merged we can perform PCA on plink.

plink --bfile "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA --pca --out "$PWD"/data/Output/plink/2-POP/PCA_results

We will plot the results using a small Python script (all credit to Xavier Farré)

# engine.opts='-l' loads files that are normally loaded when you start the shell like for instance ~/.bash_profile. In this case, python 3 is aliased to python, instead of using python 2.7.
python --version
python "$PWD"/data/Input/1KG_PCA/plot_PCA.py
PCA performed on the EP dataset

PCA performed on the EP dataset

So, as we can see there is no clustering visible between any of the populations. That is probably because of the low number of SNPs available (n < 70) and little to no allele frequency between these superpopulations for these markers.

Thus, no further sample was removed from the EP dataset because even if it was not from a European individual it would not affect the outcome of the results.


1.4.9 Resulting cases and SNPs


To obtain the SNPs and cases that are kept after QC, first we can recode the binary fails obtained into map and ped files

mkdir -p "$PWD"/data/Output/plink/1-QC/Final
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --recode --out "$PWD"/data/Output/plink/1-QC/Final/after_QC
run_shell('mkdir %CD%/data/Output/plink/1-QC/Final')
run_shell('plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --recode --out "$PWD"/data/Output/plink/1-QC/Final/after_QC')

Then we can import those files.

path <- file.path(output, "plink", "1-QC", "Final")
QC_map <- read.delim(file.path(path, "after_QC.map"), header = FALSE)
QC_ped <- read.table(file.path(path, "after_QC.ped"), quote = "\"", comment.char = "")

The second column of the map file contains the SNPs that remain after the QC and the second column of the ped file contains the IIDs kept. Thus, we can see the difference between those in the dataset without filtering and those kept after QC procedures.

snps_remain <- QC_map$V2
ids_remain <- QC_ped$V2
snps_removed <- setdiff(names(list_of_objects), snps_remain)
ids_removed <- setdiff(All$ID, ids_remain)
length(snps_removed)
## [1] 7
length(ids_removed)
## [1] 855

So 6 SNPs were removed as well as 874 cases.

First we can see to which genes those SNPs belonged.

genes <- sapply(snps_removed, function(SNP) {list_of_objects[[SNP]]$gene})
sort(table(genes), decreasing = T)
## genes
##  MAPT  APOE   DBH NR1H2  RFC1  TCN2 
##     2     1     1     1     1     1

And then we can obtain the center from which the cases removed were obtained and the diagnosis group.

centers <- sapply(ids_removed, function(id) {All$Centre[All$ID == id]}, USE.NAMES = F)
sort(table(centers), decreasing = T)
## centers
##  ROTTERDAM NOTTINGHAM     MADRID     OVIEDO    BRISTOL     OPTIMA  SANTANDER 
##        421        127        115         61         57         47         27
diagnoses <- sapply(ids_removed, function(id) {All$Diag[All$ID == id]}, USE.NAMES = F)
sort(table(diagnoses), decreasing = T)
## diagnoses
## Control      AD 
##     516     339

Then, we can keep only the cases and SNPs that passed the QC procedure in the dataset…

All <- All[All$ID %in% ids_remain,]
variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", "Region")
All <- All[,c(variables, snps_remain)]

And in the list and vectors of SNPs.

list_of_objects <- lapply(list_of_objects, function(SNP) {if (SNP$id %in% snps_remain) {SNP}})
list_of_objects[sapply(list_of_objects, is.null)] <- NULL
names(list_of_objects) <- sapply(list_of_objects, function(SNP) {SNP$id})
class(list_of_objects) <- "SNP_set"
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
## 
## 
## $snps
##  [1] "rs5749131"  "rs5753231"  "rs16988828" "rs9606756"  "rs757874"  
##  [6] "rs1801198"  "rs5749135"  "rs9621049"  "rs7289553"  "rs5997711" 
## [11] "rs1801394"  "rs13181011" "rs326120"   "rs1532268"  "rs7703033" 
## [16] "rs6555501"  "rs162035"   "rs3815743"  "rs162040"   "rs10475399"
## [21] "rs17585355" "rs2228604"  "rs7536292"  "rs17646665" "rs1149175" 
## [26] "rs10745354" "rs6265"     "rs11030102" "rs11030104" "rs11030119"
## [31] "rs12288512" "rs16961153" "rs1979277"  "rs669340"   "rs643333"  
## [36] "rs9901160"  "rs4975002"  "rs4975009"  "rs6851075"  "rs7412"    
## [41] "rs597668"   "rs1800977"  "rs2422493"  "rs7157609"  "rs4900442" 
## [46] "rs1611115"  "rs4945261"  "rs2373115"  "rs2236707"  "rs4800488" 
## [51] "rs2695121"  "rs1187323"  "rs1545285"  "rs7946"     "rs12325817"
## [56] "rs731236"   "rs7975232"  "rs744373"   "rs3865444"  "rs2069442" 
## [61] "rs2071746"  "rs3931914"  "rs1800896"  "rs1799986"  "rs11754661"
## [66] "rs1799990"  "rs1051266"  "rs13022344"
## 
## $genes
##  [1] "TCN2"    "MTRR"    "SORT1"   "BDNF"    "SHMT1"   "RFC1"    "APOE"   
##  [8] "ABCA1"   "CYP46A1" "DBH"     "GAB2"    "NPC1"    "NR1H2"   "NTRK2"  
## [15] "PEMT"    "VDR"     "BIN1"    "CD33"    "CDK5"    "HMDX1"   "HMGCR"  
## [22] "IL10"    "LRP1"    "MTHFD1L" "PRNP"    "SLC19A1" "TRAK2"  
## 
## 
## The list has a total of 68 snps and 27 genes
vector_of_snps <- snps_remain


1.5 Defining inheritance models


The next step is to obtaining all possible snp inheritance model combinations and adding the new columns to the dataset.

recessive <- paste0(vector_of_snps, "r")
additive <- paste0(vector_of_snps, "a")
dominant <- paste0(vector_of_snps, "d")
inheritance <- c(recessive, additive, dominant)
All[inheritance] <- NA

Then we define the variables (columns) order in the dataset.

col_order <- c("ID", "Diag", "Sex", "Age_to_use", "Age82", "Region",  "E4status", "Centre", vector_of_snps, inheritance)
 All <- All[,col_order]

Finally we code the inheritance with numbers depending on the genotype values for each snp. (0 reference, 1 risk allele effect, 2 two risk alleles effect {just for the additive model})

All <- generate_inheritances(snps = list_of_objects, data = All)


1.6 Perform data conversions and reference diagnosis


We convert all categorical variables into the factor class.

factors <- -c(1,4)
All[,factors] <- lapply(All[,factors], factor)

And before doing any analysis, nor subsetting the dataset we should make sure the contrasts are performed with respect to the controls, not the Alzheimer cases.

All$Diag <- relevel(All$Diag, ref = "Control")


1.7 Creating data subsets


We create a data subset for each of the regions.

for (region in levels(All$Region)) {
  subset <- subset(All, Region == region)
  assign(region, subset)
}

And for each of the centers.

for (center in levels(All$Centre)) {
  subset <- subset(All, Centre == center)
  center <- str_to_title(center)
  assign(center, subset)
}


2 Analysis


2.1 Main effects


First, we select the datasets into which we want to perform the analysis and then all covariates we want to control for.

DATASETS <- c("All", "N.Eur", "Spain")
variables <- c("Sex", "E4status", "Age82", "Centre")

Then we reorder the datasets alphabetically and set Santander as the reference level in the Spanish instead of Madrid as the former has a higher N and also making sure the analysis are performed with respect to the Control reference level not the Diagnosed ones.

for (i in seq_along(DATASETS)) {
  assign(DATASETS[i], get(DATASETS[i])[,order(names(get(DATASETS[i])))])
}
Spain$Centre <- relevel(Spain$Centre, ref = 6)

Finally we perform the main effects analysis for the selected SNPs.

master_list <- perform_analysis(.mode = "main_effects", .data = DATASETS, snps = list_of_objects, covariates = variables)

This function outputs a list with the results of the glm contained in it (best model of inheritance and glm results). An example of the structure of the list with only the main effects performed can be seen here, consisting in the first snp of the All (N.Eur + Spain) dataset:

## $rs5749131
## $rs5749131$Gene
## [1] "TCN2"
## 
## $rs5749131$Best_model
##                  rs5749131d 
## "[1] AIC quasi= 7397.08954" 
## 
## $rs5749131$Main_effects
## $rs5749131$Main_effects$rs5749131a
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.627      0.161 1.873 1.365 2.568 1.004591e-04
## SexMale            -0.559      0.061 0.572 0.507 0.645 9.122544e-20
## E4statusE4+         1.086      0.060 2.963 2.632 3.336 1.228676e-70
## Age82>82            0.310      0.061 1.364 1.209 1.539 4.482749e-07
## CentreMADRID       -0.843      0.182 0.431 0.301 0.615 3.665023e-06
## CentreNOTTINGHAM   -0.509      0.196 0.601 0.410 0.882 9.323218e-03
## CentreOPTIMA       -0.544      0.179 0.580 0.408 0.824 2.397197e-03
## CentreOVIEDO        0.333      0.204 1.395 0.936 2.079 1.016127e-01
## CentreSANTANDER    -0.071      0.195 0.932 0.636 1.365 7.161204e-01
## CentreROTTERDAM    -2.377      0.152 0.093 0.069 0.125 5.021860e-54
## rs5749131a1        -0.101      0.065 0.904 0.796 1.027 1.224832e-01
## rs5749131a2        -0.209      0.085 0.811 0.686 0.959 1.402907e-02
## 
## $rs5749131$Main_effects$rs5749131d
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.624      0.161 1.867 1.361 2.560 1.076672e-04
## SexMale            -0.559      0.061 0.572 0.507 0.645 8.643812e-20
## E4statusE4+         1.086      0.060 2.964 2.633 3.337 9.484831e-71
## Age82>82            0.311      0.061 1.365 1.210 1.540 4.096063e-07
## CentreMADRID       -0.835      0.182 0.434 0.304 0.619 4.342755e-06
## CentreNOTTINGHAM   -0.508      0.195 0.601 0.410 0.882 9.291370e-03
## CentreOPTIMA       -0.539      0.179 0.583 0.410 0.828 2.604900e-03
## CentreOVIEDO        0.337      0.203 1.400 0.940 2.086 9.805458e-02
## CentreSANTANDER    -0.067      0.195 0.935 0.638 1.369 7.298276e-01
## CentreROTTERDAM    -2.375      0.152 0.093 0.069 0.125 5.561420e-54
## rs5749131d1        -0.130      0.062 0.878 0.778 0.991 3.482203e-02
## 
## $rs5749131$Main_effects$rs5749131r
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.572      0.157 1.772 1.302 2.411 2.741695e-04
## SexMale            -0.560      0.061 0.571 0.507 0.644 8.536540e-20
## E4statusE4+         1.085      0.060 2.961 2.630 3.333 1.514202e-70
## Age82>82            0.309      0.061 1.362 1.207 1.536 5.061035e-07
## CentreMADRID       -0.850      0.182 0.427 0.299 0.610 2.970276e-06
## CentreNOTTINGHAM   -0.512      0.196 0.599 0.409 0.879 8.853409e-03
## CentreOPTIMA       -0.553      0.179 0.575 0.405 0.817 2.040860e-03
## CentreOVIEDO        0.329      0.204 1.390 0.933 2.072 1.054753e-01
## CentreSANTANDER    -0.079      0.195 0.924 0.631 1.353 6.833261e-01
## CentreROTTERDAM    -2.380      0.152 0.093 0.069 0.125 3.669116e-54
## rs5749131r1        -0.149      0.076 0.862 0.743 1.000 4.958275e-02

We can appreciate that the main effects analysis were also performed for the Spanish and Northern European subsets.

names(master_list)
## [1] "All"   "N.Eur" "Spain"

Then, selecting a threshold of 0.05, before applying multiple testing correction, we can obtain the following results for the snps under the threshold and to which genes it belongs for the whole dataset.

print_ME(master_list)
## 
##  rs5749131d1  rs5997711r1 rs11030102d1 rs11030119d1 rs12288512d1     rs7412d1 
##        0.035        0.021        0.000        0.004        0.002        0.000 
##  rs2422493r1  rs4945261d1  rs2373115d1   rs744373a2 rs13022344r1 
##        0.045        0.022        0.022        0.000        0.043 
## 
## 
## sig_genes
##  BDNF  GAB2  TCN2 ABCA1  APOE  BIN1 TRAK2 
##     3     2     2     1     1     1     1
print_ME(master_list, corrected = T)
## 
##   rs7412d1 rs744373a2 
##      0.010      0.047 
## 
## 
## sig_genes
## APOE BIN1 
##    1    1

We really have 3 snp models which are found significantly associated with AD, even after multiple testing correction.

Those are the APOE gene, for which has the E4 variant was found to be the largest known genetic risk factor for late-onset sporadic AD. The rationale is that this isoform APOE ε4 is not as effective as the others at promoting these reactions, resulting in increased vulnerability to AD in individuals with that gene variation

The BDNF snp (rs11030102) has been linked to decreased BDNF serum levels. In the adult brain, BDNF maintains high expression levels and regulates both excitatory and inhibitory synaptic transmission and activity-dependent plasticity (Miranda et al., 2019)

The bridging integrator 1 (BIN1) gene is the second most important susceptibility gene for late-onset Alzheimer’s disease (LOAD) after apolipoprotein E (APOE) gene. (Hu et al., 2021). BIN1 encodes a Myc-interacting protein but its role in AD is still unclear.

To see a count of the number of genes studied, according to the number of snps we can use the following code:

genes_studied <- sapply(master_list$All, function(snp) {snp$Gene})
genes_studied <- unname(genes_studied)
genes_studied <- table(genes_studied)
sort(genes_studied, decreasing = T)
## genes_studied
##    MTRR    TCN2   SORT1    BDNF   SHMT1    RFC1   ABCA1    APOE CYP46A1    GAB2 
##      10      10       6       5       5       3       2       2       2       2 
##    NPC1   NTRK2    PEMT     VDR    BIN1    CD33    CDK5     DBH   HMDX1   HMGCR 
##       2       2       2       2       1       1       1       1       1       1 
##    IL10    LRP1 MTHFD1L   NR1H2    PRNP SLC19A1   TRAK2 
##       1       1       1       1       1       1       1

Here we can see that from the 75 SNPs studied, all 3 APOE snps were found significant as well as the only SNP studied for BIN1. For BDNF, though just one out of five SNPs was deemed significant according to main effects, snp rs11030102.

To measure the strength of the association found we can extract the OR from the main effects results.

significant_snps <- c("rs7412d1", "rs744373a2")
OR_results <- sapply(significant_snps, function(snp_model_lvl){
  snp_model <- sub("[0-2]$", "", snp_model_lvl);
  snp <- sub("[a-z]$", "", snp_model);
  glm_results <- master_list$All[[snp]]$Main_effects[[snp_model]]
  index <- grep(rownames(glm_results), pattern = snp_model_lvl)
  glm_results[index,][3]
})
snps <- sub(names(OR_results), pattern = "[a-z][0-2].OR$", replacement = "")
genes <- sapply(snps, function(snp) {list_of_objects[[snp]]$gene})
OR_results2 <- OR_results
names(OR_results2) <- genes
sort(OR_results, decreasing = T)
## rs744373a2.OR   rs7412d1.OR 
##         1.466         0.693
sort(OR_results2, decreasing = T)
##  BIN1  APOE 
## 1.466 0.693

On the other hand, the others, hold more moderate effects, with the dominant model of snp rs7412d, which also corresponds to APOE, increases the odds of having the disease 0.682. Or what is the same, decreases it by 1/0.682 = 1.4662757

We can generate a dataframe to plot these results:

plot_df <- data.frame(matrix(NA, nrow = length(significant_snps), ncol = 6))
colnames(plot_df) <- c("snp", "gene", "OR", "lower", "upper", "pvalue")
for (i in seq_along(significant_snps)) {
  snp_model_lvl <- significant_snps[i]
  snp_model <- sub(snp_model_lvl, pattern = "[0-2]$", replacement = "")
  snp <- sub(snp_model, pattern = "[a-z]$", replacement = "")
  SNP <- master_list$All[[snp]]
  gene <- SNP$Gene
  main_effects <- SNP$Main_effects[[snp_model]]
  snp_term <- grep(pattern = snp_model_lvl, x = rownames(main_effects))
  OR <- main_effects[snp_term, 3]
  lower <- main_effects[snp_term, 4]
  upper <- main_effects[snp_term, 5]
  pvalue <- main_effects[snp_term, 6]
  plot_df[i,1] <- snp_model_lvl
  plot_df[i,2] <- gene
  plot_df[i,3] <- OR
  plot_df[i,4] <- lower
  plot_df[i,5] <- upper
  plot_df[i,6] <- pvalue
}

And plot them:

## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.


2.2 Covariate interaction


The next step in the analysis would be to examine all possible interactions of the 75 studied SNPs with the covariates Age82, Sex & E4status.

In practice this means repeating the models from the main effects but by adding one interaction term at a time.

In order to reduce the multiple testing performed and due to the previous findings of what the best models were, only the best snp models chosen for each main effect were employed in this subsequent step.

master_list <- perform_analysis(.mode = "interaction", .data = DATASETS, snps = list_of_objects, covariates = variables)

And here we can see an example of the kind of results obtained by doing this analysis.

master_list$All[[1]]$Interactions$Other_covariates
## $Age82
## $Age82$Name
## [1] "rs5749131d1:Age82>82"
## 
## $Age82$Summary
##                      Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)             0.635      0.166 1.887 1.363 2.613 1.314493e-04
## SexMale                -0.559      0.061 0.572 0.507 0.645 8.628824e-20
## E4statusE4+             1.086      0.060 2.962 2.631 3.335 1.350355e-70
## CentreMADRID           -0.834      0.182 0.434 0.304 0.620 4.534412e-06
## CentreNOTTINGHAM       -0.507      0.195 0.602 0.411 0.883 9.515261e-03
## CentreOPTIMA           -0.538      0.179 0.584 0.411 0.830 2.699116e-03
## CentreOVIEDO            0.338      0.204 1.403 0.941 2.090 9.642355e-02
## CentreSANTANDER        -0.065      0.195 0.937 0.640 1.373 7.390780e-01
## CentreROTTERDAM        -2.374      0.152 0.093 0.069 0.126 7.240989e-54
## rs5749131d1            -0.148      0.090 0.863 0.723 1.029 1.011110e-01
## Age82>82                0.289      0.103 1.335 1.091 1.633 5.032564e-03
## rs5749131d1:Age82>82    0.034      0.124 1.034 0.812 1.317 7.855417e-01
## 
## $Age82$SF
## [1] 1.034
## 
## $Age82$Significant
## [1] FALSE
## 
## 
## $Sex
## $Sex$Name
## [1] "rs5749131d1:SexMale"
## 
## $Sex$Summary
##                     Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)            0.631      0.165 1.880 1.361 2.597 1.290372e-04
## E4statusE4+            1.087      0.060 2.964 2.633 3.337 9.654106e-71
## Age82>82               0.311      0.061 1.365 1.210 1.540 4.138155e-07
## CentreMADRID          -0.836      0.182 0.433 0.303 0.619 4.274699e-06
## CentreNOTTINGHAM      -0.509      0.195 0.601 0.410 0.882 9.251647e-03
## CentreOPTIMA          -0.540      0.179 0.583 0.410 0.828 2.576107e-03
## CentreOVIEDO           0.336      0.203 1.399 0.939 2.085 9.848828e-02
## CentreSANTANDER       -0.068      0.195 0.934 0.638 1.368 7.258487e-01
## CentreROTTERDAM       -2.376      0.152 0.093 0.069 0.125 5.646864e-54
## rs5749131d1           -0.140      0.078 0.870 0.747 1.013 7.208241e-02
## SexMale               -0.577      0.106 0.562 0.457 0.691 4.830146e-08
## rs5749131d1:SexMale    0.026      0.127 1.026 0.799 1.317 8.394594e-01
## 
## $Sex$SF
## [1] 1.026
## 
## $Sex$Significant
## [1] FALSE
## 
## 
## $E4status
## $E4status$Name
## [1] "rs5749131d1:E4statusE4+"
## 
## $E4status$Summary
##                         Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)                0.647      0.165 1.909 1.382 2.636 8.710879e-05
## SexMale                   -0.559      0.061 0.572 0.507 0.645 9.635296e-20
## Age82>82                   0.310      0.061 1.363 1.208 1.538 4.797888e-07
## CentreMADRID              -0.838      0.182 0.433 0.303 0.618 4.120560e-06
## CentreNOTTINGHAM          -0.508      0.195 0.602 0.410 0.882 9.339027e-03
## CentreOPTIMA              -0.542      0.179 0.582 0.410 0.827 2.515097e-03
## CentreOVIEDO               0.334      0.203 1.396 0.937 2.080 1.010743e-01
## CentreSANTANDER           -0.067      0.195 0.935 0.638 1.370 7.304117e-01
## CentreROTTERDAM           -2.376      0.152 0.093 0.069 0.125 5.205773e-54
## rs5749131d1               -0.161      0.078 0.852 0.731 0.991 3.820745e-02
## E4statusE4+                1.030      0.105 2.802 2.280 3.444 1.636899e-22
## rs5749131d1:E4statusE4+    0.083      0.127 1.087 0.846 1.395 5.149966e-01
## 
## $E4status$SF
## [1] 1.087
## 
## $E4status$Significant
## [1] FALSE

First, for the variable “Name” we have the interaction studied, then as “Summary” the full glm model results obtained, and the rest of variables, extract the most interesting results for the interaction, the “SF” and whether it is found “Significant” or not.

The SF is a statistic used to measure interactions in complex diseases. It is defined as the OR of the interaction divided by the product of the OR of each of the interaction on its own.

Thus, it makes a good and easy to understand metric for what we are measuring as if there was no interaction effect we would expect the SF to be close to 1, as the impact of the interaction should be about the same as the effect of each of the members on its on. However, if the interaction is found greater or lower than 1, that means we are having a synergistic effect in which the odds on having the disease is increased or reduced respectively.

Having said that, we can obtain the significant results found for this snp-covariate interaction measured:

cov <- setdiff(variables, c("Centre", "E4status"))
print_INT(master_list, covariates = cov)
## 
##  rs5997711r1:E4statusE4+     rs13181011r1:SexMale      rs326120r1:Age82>82 
##                    0.038                    0.042                    0.019 
##   rs162040d1:E4statusE4+     rs11030102d1:SexMale     rs11030119d1:SexMale 
##                    0.016                    0.001                    0.023 
##     rs12288512d1:SexMale     rs16961153r1:SexMale rs16961153r1:E4statusE4+ 
##                    0.001                    0.025                    0.021 
##      rs6851075r1:SexMale        rs7412d1:Age82>82  rs2069442r1:E4statusE4+ 
##                    0.050                    0.012                    0.005 
##     rs11754661d1:SexMale 
##                    0.042 
## 
## 
## [1] "Sex"
## sig_genes
##    BDNF MTHFD1L    MTRR    RFC1   SHMT1 
##       3       1       1       1       1 
## 
## 
## [1] "Age82"
## sig_genes
## APOE MTRR 
##    1    1

However, after applying multiple testing correction no significant associations are found.

print_INT(master_list, covariates = cov, corrected = T)
## No significant interactions found


2.3 SNP-SNP interaction


2.3.1 Reported

Get the SNP-SNP interactions dataset.

snp_snp <- read.csv2(file = file.path(input, "interactions.csv"))
rownames(snp_snp) <- NULL

Separate the datasets into 2, one with the APOE interactions and the other with the rest.

snp_APOE <- snp_snp[snp_snp$gene2 == "APOE",]
snp_snp <- snp_snp[snp_snp$gene2 != "APOE",]

Recode the model into a format "rs[0-9]*[a,d,r]", and replace unknown with the best model found for main effects.

snp_APOE$model1 <- get_model(snp_APOE$snp1, snp_APOE$model1, master_list)
snp_APOE$model2 <- "E4status"
snp_snp$model1 <- get_model(snp_snp$snp1, snp_snp$model1, master_list)
snp_snp$model2 <- get_model(snp_snp$snp2, snp_snp$model2, master_list)

Get a list of the interactions found in it.

list_of_snp_pairs <- generate_list_of_snp_pairs(mode = "reported_model",
                                                snp_df = snp_snp)
list_of_APOE <- generate_list_of_snp_pairs(mode = "reported_model",
                                           snp_df = snp_APOE)
list_of_snp_pairs
## [[1]]
## [1] "rs13022344d" "rs597668d"  
## 
## [[2]]
## [1] "rs13022344d" "rs744373d"  
## 
## [[3]]
## [1] "rs2071746d" "rs2695121r"
## 
## [[4]]
## [1] "rs1800977r" "rs3931914d"
## 
## [[5]]
## [1] "rs2422493r" "rs3931914d"
## 
## [[6]]
## [1] "rs2422493r" "rs4800488r"
## 
## [[7]]
## [1] "rs2422493r" "rs2236707r"
## 
## [[8]]
## [1] "rs7975232r" "rs1800896d"
## 
## [[9]]
## [1] "rs731236d"  "rs1800896d"
## 
## [[10]]
## [1] "rs7157609r" "rs4900442r"
list_of_APOE
## [[1]]
## [1] "rs2069442r" "E4status"  
## 
## [[2]]
## [1] "rs1800977r" "E4status"  
## 
## [[3]]
## [1] "rs597668d" "E4status" 
## 
## [[4]]
## [1] "rs744373d" "E4status" 
## 
## [[5]]
## [1] "rs1799990r" "E4status"  
## 
## [[6]]
## [1] "rs2373115d" "E4status"

Creating the interactions containers in the master_list

master_list <- store_interactions(snp_dataset = snp_snp, master_list = master_list)
## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

## Warning in master_list[[dataset]][[snp]]$Interactions$SNP$Reported <-
## interactions_list[[snp]]: Coercing LHS to a list

Performing the analysis

master_list <- perform_analysis(.mode = "snp",
                                .submode = "reported_model",
                                .data = DATASETS,
                                snps = list_of_snp_pairs,
                                covariates = variables,
                                .verbose = T)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%

Printing the results

print_SNP_SNP(master_list, list_of_snp_pairs, list_of_APOE)
## 
## rs7157609r1:rs4900442r1 rs2069442r1:E4statusE4+ 
##                   0.000                   0.005 
## 
## 
## sig_genes
##       CDK5-APOE CYP46A1-CYP46A1 
##               1               1
print_SNP_SNP(master_list, list_of_snp_pairs, list_of_APOE, corrected = T)
## 
## rs7157609r1:rs4900442r1 
##                   0.001 
## 
## 
## CYP46A1-CYP46A1 
##               1

We can plot this interaction but by center.

CYP46A1_interaction <- vector(mode = "list", length = length(levels(All$Centre)))
for (centre in str_to_title(levels(All$Centre))) {
  snp1_model <- "rs7157609r"
  snp2_model <- "rs4900442r"
  covariates <- c("Sex", "Age82", "E4status")
  interaction <- paste0(snp1_model, "*", snp2_model)
  predictors <- c(covariates, interaction)
  formula <- as.formula(paste("Diag", paste(predictors, collapse = " + "), sep = " ~ "))
  args <-  list(formula = formula, family = quasibinomial("logit"), data = as.name(centre))
  linear_model <- do.call(glm, args)
  OR_summary <- Coeff.OR2(linear_model)
  cat("\n")
  print(centre)
  print(OR_summary)
  i <- which(str_to_title(levels(All$Centre)) == centre)
  CYP46A1_interaction[[i]] <- OR_summary
  names(CYP46A1_interaction)[i] <- centre
}
## 
## [1] "Bristol"
##                         Estimate Std. Error       OR   lower    upper
## (Intercept)             -0.03045    0.35927  0.97001 0.47969  1.96151
## SexMale                 -0.18704    0.34851  0.82941 0.41890  1.64221
## Age82>82                 0.02347    0.35298  1.02375 0.51254  2.04484
## E4statusE4+              2.64817    0.40538 14.12820 6.38298 31.27162
## rs7157609r1             -0.63006    0.78831  0.53256 0.11359  2.49686
## rs4900442r1             -0.00007    0.50460  0.99993 0.37192  2.68837
## rs7157609r1:rs4900442r1  0.80337    1.14392  2.23305 0.23723 21.01943
##                             Pr(>|t|)
## (Intercept)             9.325336e-01
## SexMale                 5.920071e-01
## Age82>82                9.470403e-01
## E4statusE4+             4.104252e-10
## rs7157609r1             4.249711e-01
## rs4900442r1             9.998895e-01
## rs7157609r1:rs4900442r1 4.832051e-01
## 
## [1] "Madrid"
##                         Estimate Std. Error      OR   lower    upper
## (Intercept)             -0.34089    0.17543 0.71114 0.50423  1.00296
## SexMale                 -0.31897    0.23052 0.72690 0.46265  1.14208
## Age82>82                -0.30332    0.25176 0.73836 0.45079  1.20940
## E4statusE4+              1.69970    0.25111 5.47233 3.34523  8.95198
## rs7157609r1             -0.33175    0.63237 0.71767 0.20780  2.47860
## rs4900442r1             -0.39936    0.36058 0.67075 0.33085  1.35985
## rs7157609r1:rs4900442r1  0.98453    0.90893 2.67654 0.45069 15.89529
##                             Pr(>|t|)
## (Intercept)             5.272160e-02
## SexMale                 1.672540e-01
## Age82>82                2.290165e-01
## E4statusE4+             4.841628e-11
## rs7157609r1             6.001507e-01
## rs4900442r1             2.687462e-01
## rs7157609r1:rs4900442r1 2.794075e-01
## 
## [1] "Nottingham"
##                         Estimate Std. Error       OR   lower    upper
## (Intercept)             -0.41956    0.31418  0.65734 0.35510  1.21682
## SexMale                 -0.41392    0.28571  0.66105 0.37760  1.15728
## Age82>82                 0.70019    0.28802  2.01414 1.14531  3.54204
## E4statusE4+              1.68982    0.29524  5.41850 3.03786  9.66474
## rs7157609r1             -0.91923    0.66836  0.39883 0.10761  1.47811
## rs4900442r1             -0.66756    0.43288  0.51296 0.21959  1.19828
## rs7157609r1:rs4900442r1  2.54708    0.96496 12.76970 1.92660 84.63869
##                             Pr(>|t|)
## (Intercept)             1.828825e-01
## SexMale                 1.485830e-01
## Age82>82                1.571232e-02
## E4statusE4+             2.797494e-08
## rs7157609r1             1.701788e-01
## rs4900442r1             1.242252e-01
## rs7157609r1:rs4900442r1 8.789293e-03
## 
## [1] "Optima"
##                         Estimate Std. Error       OR   lower    upper
## (Intercept)             -0.62762    0.23284  0.53386 0.33824  0.84262
## SexMale                 -0.14355    0.22565  0.86628 0.55665  1.34814
## Age82>82                 0.30978    0.23067  1.36313 0.86734  2.14231
## E4statusE4+              2.06097    0.23308  7.85357 4.97351 12.40140
## rs7157609r1             -1.37584    0.60253  0.25263 0.07755  0.82292
## rs4900442r1             -0.31941    0.29480  0.72658 0.40770  1.29486
## rs7157609r1:rs4900442r1  2.74865    0.77739 15.62147 3.40406 71.68798
##                             Pr(>|t|)
## (Intercept)             7.311692e-03
## SexMale                 5.250180e-01
## Age82>82                1.800036e-01
## E4statusE4+             2.613102e-17
## rs7157609r1             2.290212e-02
## rs4900442r1             2.792225e-01
## rs7157609r1:rs4900442r1 4.518687e-04
## 
## [1] "Oviedo"
##                         Estimate Std. Error       OR   lower     upper
## (Intercept)              0.52828    0.22416  1.69601 1.09299   2.63172
## SexMale                 -0.12607    0.29733  0.88156 0.49222   1.57885
## Age82>82                 3.11393    1.03902 22.50927 2.93719 172.50096
## E4statusE4+              0.74632    0.32886  2.10922 1.10711   4.01840
## rs7157609r1             -0.94419    0.79781  0.38900 0.08144   1.85804
## rs4900442r1              0.28415    0.45425  1.32863 0.54543   3.23642
## rs7157609r1:rs4900442r1  1.51544    1.20898  4.55142 0.42564  48.66856
##                            Pr(>|t|)
## (Intercept)             0.019152615
## SexMale                 0.671902012
## Age82>82                0.002979355
## E4statusE4+             0.024029937
## rs7157609r1             0.237658874
## rs4900442r1             0.532144484
## rs7157609r1:rs4900442r1 0.211108446
## 
## [1] "Santander"
##                         Estimate Std. Error      OR   lower    upper
## (Intercept)              0.68813    0.23384 1.98999 1.25836  3.14702
## SexMale                 -0.25456    0.28765 0.77526 0.44116  1.36239
## Age82>82                -0.95175    0.28291 0.38606 0.22174  0.67217
## E4statusE4+              1.77570    0.32654 5.90440 3.11329 11.19779
## rs7157609r1              0.28540    0.88403 1.33029 0.23520  7.52399
## rs4900442r1             -0.38858    0.44284 0.67802 0.28464  1.61509
## rs7157609r1:rs4900442r1 -1.88128    1.41850 0.15239 0.00945  2.45708
##                             Pr(>|t|)
## (Intercept)             3.511210e-03
## SexMale                 3.769086e-01
## Age82>82                8.693889e-04
## E4statusE4+             1.132204e-07
## rs7157609r1             7.470494e-01
## rs4900442r1             3.809449e-01
## rs7157609r1:rs4900442r1 1.857830e-01
## 
## [1] "Rotterdam"
##                         Estimate Std. Error      OR   lower   upper
## (Intercept)             -1.72576    0.07470 0.17804 0.15379 0.20611
## SexMale                 -0.73033    0.07818 0.48175 0.41331 0.56153
## Age82>82                 0.42131    0.07486 1.52395 1.31598 1.76480
## E4statusE4+              0.77592    0.07450 2.17259 1.87742 2.51416
## rs7157609r1             -0.24360    0.19420 0.78380 0.53568 1.14686
## rs4900442r1             -0.12362    0.10507 0.88371 0.71923 1.08581
## rs7157609r1:rs4900442r1  0.50423    0.25059 1.65571 1.01316 2.70578
##                              Pr(>|t|)
## (Intercept)             7.898817e-113
## SexMale                  1.343549e-20
## Age82>82                 1.914210e-08
## E4statusE4+              3.604039e-25
## rs7157609r1              2.097575e-01
## rs4900442r1              2.394325e-01
## rs7157609r1:rs4900442r1  4.425165e-02
 #PLOT 

forest_plot <- as.data.frame(matrix(data = NA, nrow = length(CYP46A1_interaction), ncol = 6))
colnames(forest_plot) <- c("SF", "lower", "upper", "p_value", "Centre", "gene")
for (i in seq_along(CYP46A1_interaction)) {
  results <- CYP46A1_interaction[[i]]
  results <- tail(results, n = 1)
  forest_plot[i, 1] <- results[3]
  forest_plot[i, 2] <- results[4]
  forest_plot[i, 3] <- results[5]
  forest_plot[i, 4] <- as.character(ifelse(results[6] >= 0.005, round(results[6], digits = 2),"< 0.005"))
  forest_plot[i, 5] <- names(CYP46A1_interaction)[i]
  forest_plot[i, 6] <- "CYP46A1*CYP46A1"
}
p <- ggplot(data = forest_plot, aes(x = SF, y = Centre, xmin = lower, xmax = upper))
p <- p + geom_pointrange(aes(col = Centre))
p <- p + geom_vline(aes(fill = Centre), xintercept = 1, linetype = "dotted")
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
p <- p + geom_errorbar(aes(xmin = lower, xmax = upper, col = Centre),width = 0.5, cex = 1)
p <- p + facet_wrap(~gene, strip.position = "left", nrow = 1, scales = "free_y")
p <- p + geom_text(aes(label = p_value), hjust = -0.5, vjust = -1)
p <- p+theme(axis.title.y=element_blank(),
             axis.text.y=element_blank(),
             axis.ticks.y=element_blank())

p

Doing the same but without Optima to ascertain that the signal doesn’t come only from there.

no_optima <- All[All$Centre != "NOTTINGHAM",]
formula <- as.formula(Diag ~ Sex + Centre + Age82 + E4status + rs7157609r * rs4900442r)
linear_model <- glm(formula = formula, family = quasibinomial("logit"), data = no_optima)
OR_summary <- Coeff.OR2(linear_model)
OR_summary
##                         Estimate Std. Error      OR   lower   upper
## (Intercept)              0.55383    0.15831 1.73990 1.27574 2.37292
## SexMale                 -0.55980    0.06298 0.57133 0.50498 0.64639
## CentreMADRID            -0.80888    0.18191 0.44536 0.31179 0.63614
## CentreOPTIMA            -0.52162    0.17957 0.59356 0.41746 0.84394
## CentreOVIEDO             0.34359    0.20456 1.41000 0.94427 2.10542
## CentreSANTANDER         -0.05130    0.19550 0.95000 0.64760 1.39360
## CentreROTTERDAM         -2.35592    0.15292 0.09481 0.07025 0.12794
## Age82>82                 0.29592    0.06307 1.34436 1.18805 1.52124
## E4statusE4+              1.06714    0.06211 2.90704 2.57384 3.28337
## rs7157609r1             -0.38552    0.16625 0.68010 0.49097 0.94206
## rs4900442r1             -0.14805    0.08843 0.86239 0.72516 1.02559
## rs7157609r1:rs4900442r1  0.74107    0.21558 2.09817 1.37510 3.20146
##                             Pr(>|t|)
## (Intercept)             4.711384e-04
## SexMale                 7.721177e-19
## CentreMADRID            8.857666e-06
## CentreOPTIMA            3.685038e-03
## CentreOVIEDO            9.306382e-02
## CentreSANTANDER         7.930341e-01
## CentreROTTERDAM         1.018479e-52
## Age82>82                2.751747e-06
## E4statusE4+             7.107484e-65
## rs7157609r1             2.042292e-02
## rs4900442r1             9.412707e-02
## rs7157609r1:rs4900442r1 5.904397e-04
APOE4_CDK5_interaction <- vector(mode = "list", length = length(levels(All$Centre))-1)
for (centre in str_to_title(levels(All$Centre))) {
  if (centre == "Bristol"){
    next
  }
  snp1_model <- "rs2069442r"
  snp2_model <- "E4status"
  covariates <- c("Sex", "Age82")
  interaction <- paste0(snp1_model, "*", snp2_model)
  predictors <- c(covariates, interaction)
  formula <- as.formula(paste("Diag", paste(predictors, collapse = " + "), sep = " ~ "))
  args <-  list(formula = formula, family = quasibinomial("logit"), data = as.name(centre))
  linear_model <- do.call(glm, args)
  OR_summary <- Coeff.OR2(linear_model)
  cat("\n")
  print(centre)
  print(OR_summary)
  i <- which(str_to_title(levels(All$Centre)) == centre)
  APOE4_CDK5_interaction[[i]] <- OR_summary
  names(APOE4_CDK5_interaction)[i] <- centre
}
## 
## [1] "Madrid"
##                         Estimate Std. Error      OR   lower   upper
## (Intercept)             -0.42209    0.17044 0.65567 0.46947 0.91574
## SexMale                 -0.29434    0.22879 0.74502 0.47580 1.16658
## Age82>82                -0.26994    0.25011 0.76342 0.46760 1.24641
## rs2069442r1              0.56138    0.65398 1.75309 0.48655 6.31656
## E4statusE4+              1.73354    0.25833 5.66067 3.41171 9.39212
## rs2069442r1:E4statusE4+ -1.01309    0.99437 0.36309 0.05171 2.54944
##                             Pr(>|t|)
## (Intercept)             1.369528e-02
## SexMale                 1.990187e-01
## Age82>82                2.811197e-01
## rs2069442r1             3.911940e-01
## E4statusE4+             6.890021e-11
## rs2069442r1:E4statusE4+ 3.089200e-01
## 
## [1] "Nottingham"
##                         Estimate Std. Error      OR   lower    upper
## (Intercept)             -0.46658    0.30258 0.62714 0.34658  1.13481
## SexMale                 -0.51693    0.28392 0.59635 0.34184  1.04034
## Age82>82                 0.70145    0.28993 2.01668 1.14245  3.55987
## rs2069442r1              0.94480    0.61158 2.57229 0.77577  8.52918
## E4statusE4+              1.74744    0.30447 5.73990 3.16035 10.42495
## rs2069442r1:E4statusE4+ -1.62042    1.15742 0.19782 0.02047  1.91194
##                             Pr(>|t|)
## (Intercept)             1.242538e-01
## SexMale                 6.977532e-02
## Age82>82                1.621992e-02
## rs2069442r1             1.235732e-01
## E4statusE4+             2.584897e-08
## rs2069442r1:E4statusE4+ 1.626691e-01
## 
## [1] "Optima"
##                         Estimate Std. Error      OR   lower    upper
## (Intercept)             -0.61016    0.21925 0.54326 0.35349  0.83491
## SexMale                 -0.14485    0.22538 0.86515 0.55621  1.34569
## Age82>82                 0.25175    0.22990 1.28627 0.81967  2.01849
## rs2069442r1             -0.38795    0.69919 0.67845 0.17233  2.67103
## E4statusE4+              2.04259    0.24127 7.71057 4.80522 12.37255
## rs2069442r1:E4statusE4+  0.27336    0.92036 1.31437 0.21642  7.98259
##                             Pr(>|t|)
## (Intercept)             5.630354e-03
## SexMale                 5.207830e-01
## Age82>82                2.741204e-01
## rs2069442r1             5.792908e-01
## E4statusE4+             4.318149e-16
## rs2069442r1:E4statusE4+ 7.666016e-01
## 
## [1] "Oviedo"
##                         Estimate Std. Error       OR   lower     upper
## (Intercept)              0.52487    0.20923  1.69024 1.12162   2.54712
## SexMale                 -0.13342    0.28687  0.87510 0.49873   1.53548
## Age82>82                 3.11450    1.01710 22.52221 3.06788 165.34228
## rs2069442r1              1.03470    0.79510  2.81427 0.59233  13.37116
## E4statusE4+              0.79968    0.33571  2.22483 1.15222   4.29593
## rs2069442r1:E4statusE4+ -1.20038    1.17202  0.30108 0.03027   2.99449
##                            Pr(>|t|)
## (Intercept)             0.012698852
## SexMale                 0.642233903
## Age82>82                0.002415291
## rs2069442r1             0.194227999
## E4statusE4+             0.017895591
## rs2069442r1:E4statusE4+ 0.306641092
## 
## [1] "Santander"
##                         Estimate Std. Error      OR   lower    upper
## (Intercept)              0.61744    0.22975 1.85418 1.18192  2.90881
## SexMale                 -0.26664    0.28592 0.76595 0.43734  1.34146
## Age82>82                -0.91936    0.28094 0.39877 0.22993  0.69162
## rs2069442r1             -0.12380    0.74260 0.88356 0.20612  3.78747
## E4statusE4+              1.76129    0.32953 5.81996 3.05082 11.10255
## rs2069442r1:E4statusE4+ -0.17215    1.36944 0.84185 0.05748 12.32899
##                             Pr(>|t|)
## (Intercept)             7.602214e-03
## SexMale                 3.517886e-01
## Age82>82                1.191066e-03
## rs2069442r1             8.677114e-01
## E4statusE4+             1.798408e-07
## rs2069442r1:E4statusE4+ 9.000454e-01
## 
## [1] "Rotterdam"
##                         Estimate Std. Error      OR   lower   upper
## (Intercept)             -1.76515    0.07368 0.17116 0.14814 0.19775
## SexMale                 -0.72833    0.07818 0.48271 0.41414 0.56265
## Age82>82                 0.41937    0.07483 1.52101 1.31351 1.76128
## rs2069442r1              0.33548    0.15737 1.39861 1.02740 1.90394
## E4statusE4+              0.82213    0.07683 2.27534 1.95725 2.64512
## rs2069442r1:E4statusE4+ -0.77878    0.32055 0.45897 0.24486 0.86028
##                              Pr(>|t|)
## (Intercept)             8.483853e-121
## SexMale                  1.693239e-20
## Age82>82                 2.191982e-08
## rs2069442r1              3.306694e-02
## E4statusE4+              1.826176e-26
## rs2069442r1:E4statusE4+  1.515102e-02
APOE4_CDK5_interaction <- rlist::list.clean(APOE4_CDK5_interaction, recursive = T)
#plot
forest_plot <- as.data.frame(matrix(data = NA, nrow = length(APOE4_CDK5_interaction), ncol = 6))
colnames(forest_plot) <- c("SF", "lower", "upper", "p_value", "Centre", "gene")
for (i in seq_along(APOE4_CDK5_interaction)) {
  results <- APOE4_CDK5_interaction[[i]]
  results <- tail(results, n=1)
  forest_plot[i, 1] <- results[3]
  forest_plot[i, 2] <- results[4]
  forest_plot[i, 3] <- results[5]
  forest_plot[i, 4] <- round(results[6], digits = 2)
  forest_plot[i, 5] <- names(APOE4_CDK5_interaction)[i]
  forest_plot[i, 6] <- "APOE4*CDK5"
}
p <- ggplot(data = forest_plot, aes(x = SF, y = Centre, xmin = lower, xmax = upper))
p <- p + geom_pointrange(aes(col = Centre))
p <- p + geom_vline(aes(fill = Centre), xintercept = 1, linetype = "dotted")
## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.
p <- p + geom_errorbar(aes(xmin = lower, xmax = upper, col = Centre),width = 0.5, cex = 1)
p <- p + facet_wrap(~gene, strip.position = "left", nrow = 1, scales = "free_y")
p <- p + geom_text(aes(label = round(p_value, 3)),hjust = -0.5, vjust = -1)
p <- p+theme(axis.title.y=element_blank(),
             axis.text.y=element_blank(),
             axis.ticks.y=element_blank())
p